References:
The term Boosting refers to a family of algorithms that convert weak learners into strong learners.
There are many boosting algorithms that give an additional boost to a model’s accuracy. In this tutorial, we’ll learn about the two most commonly used algorithms, i.e. Gradient Boosting (GBM) and XGBoost.
Generally XGboost is considered more advanced than gbm.
import time
notebook_start_time = time.time()
import bhishan
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
SEED = 100
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
import xgboost as xgb
# six and pickle
import six
import pickle
import joblib
# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
# Empty results table: one row per fitted model is appended later
# (Model/Description identify the run; the rest are scalar metrics).
eval_columns = ['Model', 'Description', 'Accuracy',
                'Precision', 'Recall', 'F1', 'AUC']
df_eval = pd.DataFrame({col: [] for col in eval_columns})
# Load the zipped credit-card fraud csv and peek at shape / first rows.
ifile = '../data/raw/creditcard.csv.zip'
df = pd.read_csv(ifile, compression='zip')
print(df.shape)
df.head()

# Target column; percentage distribution of each class.
target = 'Class'
df[target].value_counts(normalize=True).mul(100)
from sklearn.model_selection import train_test_split

target = 'Class'

# Split 1: hold out 20% of the data as the final test set.
# stratify keeps the (highly imbalanced) fraud ratio identical in each part.
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target, axis=1),
    df[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df[target])

ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

# Split 2: carve a validation set (20% of the training part) out of the
# original training data, again stratified on the target.
# (fix: removed a redundant duplicate re-assignment of `ytest` that
# repeated the line above verbatim.)
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig,
    ser_ytrain_orig,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_orig)

ytrain = ser_ytrain.to_numpy().ravel()

print(df_Xtrain.shape)
df_Xtrain.head()
from bhishan import bp
# Bar chart of the class counts with value labels on each bar.
ax = sns.countplot(df[target])
bp.add_text_barplot(ax)
# Percentage distribution of the target classes.
df[target].value_counts(normalize=True).mul(100)
# 99.8 % transaction are non-fraud
# its extremely imbalanced case.
Parameters:
-------------
max_depth=3
learning_rate=0.1
n_estimators=100
verbosity=1 **NOTE: it prints in the IPython terminal, not in the browser
silent=None **deprecated use verbosity
objective='binary:logistic' **for binary classification
booster='gbtree' **use default tree not linear
n_jobs=1 **make this -1
nthread=None **deprecated use n_jobs
gamma=0
min_child_weight=1
max_delta_step=0
subsample=1
colsample_bytree=1
colsample_bylevel=1
colsample_bynode=1
reg_alpha=0
reg_lambda=1
scale_pos_weight=1
base_score=0.5
random_state=0 **use your own random state
seed=None **deprecated use random_state
missing=None
early stopping xgboost official note:
If early stopping occurs, the model will have three additional fields: bst.best_score, bst.best_iteration and bst.best_ntree_limit. Note that xgboost.train() will return a model from the last iteration, not the best one. Example
# Example only (X_train / y_train are placeholders, not defined here):
# training stops once the AUC on eval_set has not improved for 10 rounds.
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
eval_set=[(X_test, y_test)])
# help(XGBClassifier)
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix
time_start = time.time()
# ---- Baseline: default XGBoost on the imbalanced data ----
# current parameters
Xtr = df_Xtrain_orig   # full training features (train + valid portion)
ytr = ser_ytrain_orig  # full training target
Xtx = df_Xtest         # held-out test features
ytx = ser_ytest        # held-out test target
Xvd = df_Xvalid
yvd = ser_yvalid
# model fit
clf_xgb = XGBClassifier(n_jobs=-1, random_state=SEED,
objective='binary:logistic')
clf_xgb.fit(Xtr, ytr)
# predictions
# NOTE(review): cross_val_predict clones and RE-FITS the estimator on the
# test data inside each fold; the fit on Xtr above is not used for these
# predictions. A conventional held-out evaluation would call
# clf_xgb.predict(Xtx) — confirm the CV-on-test scheme is intentional.
skf = StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_xgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv
# model evaluation
# NOTE(review): roc_auc_score receives hard 0/1 labels here rather than
# probabilities, so "AUC" is label-based — confirm intended.
average = 'binary'
row_eval = ['Xgboost','default, imbalanced',
accuracy_score(ytx, ypreds),
precision_score(ytx, ypreds, average=average),
recall_score(ytx, ypreds, average=average),
f1_score(ytx, ypreds, average=average),
roc_auc_score(ytx, ypreds),
]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()  # guards against re-running this cell
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
# confusion matrix
print(confusion_matrix(ytx, ypreds))
print(classification_report(ytx,ypreds))
# feature importance
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(clf_xgb,ax=ax)
plt.show()
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix
time_start = time.time()
# ---- XGBoost with early stopping on the validation set ----
# current parameters
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig
Xtx = df_Xtest
ytx = ser_ytest
Xvd = df_Xvalid
yvd = ser_yvalid
# model fit
# early stopping
clf_xgb = XGBClassifier(n_jobs=-1, random_state=SEED,
objective='binary:logistic')
# Stop adding trees after 30 rounds with no validation-AUC improvement.
clf_xgb.fit(Xtr,ytr,
eval_set=[(Xvd,yvd)],
eval_metric='auc',
early_stopping_rounds=30, # early stopping gives
)
# predictions
# NOTE(review): cross_val_predict re-fits clones on the test data, which
# also discards the early-stopped fit above — confirm intended.
skf = StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_xgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv
# model evaluation
average = 'binary'
row_eval = ['Xgboost','earlystop 30, imbalanced',
accuracy_score(ytx, ypreds),
precision_score(ytx, ypreds, average=average),
recall_score(ytx, ypreds, average=average),
f1_score(ytx, ypreds, average=average),
roc_auc_score(ytx, ypreds),
]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
# confusion matrix
print(confusion_matrix(ytx, ypreds))
print(classification_report(ytx,ypreds))
# feature importance
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(clf_xgb,ax=ax)
plt.show()
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
# ---- Grid search over scale_pos_weight (class-imbalance weighting) ----
clf_xgb = XGBClassifier(n_jobs=-1, random_state=SEED,
objective='binary:logistic')
# define grid
# NOTE(review): the usual heuristic is scale_pos_weight =
# n_negative / n_positive (several hundred for this ~99.8%-non-fraud
# data); the value 99.8 looks like the class percentage rather than the
# ratio — confirm the candidate values are intentional.
weights = [1, 99.8, 1000]
param_grid = dict(scale_pos_weight=weights)
# cross validation: 5 folds repeated twice, stratified on the target.
cv = RepeatedStratifiedKFold(n_splits=5,
n_repeats=2,
random_state=SEED)
# define grid search (scored on ROC AUC)
grid = GridSearchCV(estimator=clf_xgb,
param_grid=param_grid,
n_jobs=-1,
cv=cv,
scoring='roc_auc',
verbose=1
)
# The code below is wrapped in a string literal so it is NOT executed
# (the search is expensive); remove the quotes to run it.
"""
# execute the grid search
grid_result = grid.fit(Xtr, ytr)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_,
grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
""";
Important Parameters:
Regularization parameters:
import optuna
optuna.logging.set_verbosity(optuna.logging.INFO)
# use INFO to see progress
from xgboost import XGBClassifier
# Quick pre-tuning baseline: a small 10-tree model scored by AUC of hard
# predictions on the test set.
# NOTE(review): num_eval is not a standard XGBClassifier parameter — the
# sklearn wrapper passes unknown kwargs through to the booster where it
# is ignored — confirm it is intentional.
clf_xgb = XGBClassifier(random_state=SEED,num_eval=10,n_estimators=10)
clf_xgb.fit(df_Xtrain,ser_ytrain)
ypreds = clf_xgb.predict(df_Xtest)
score = roc_auc_score(ser_ytest.to_numpy().ravel(),
ypreds)
print(score)
def objective(trial):
    """Optuna objective: fit an XGBoost classifier with sampled
    hyperparameters and return the validation ROC AUC (to be maximized).

    Uses the module-level splits df_Xtrain/ser_ytrain (fit) and
    df_Xvalid/ser_yvalid (score).
    """
    search_space = {
        'eval_metric': 'auc',
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1.0),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'n_estimators': trial.suggest_int('n_estimators', 150, 1000),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 100.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
    }

    model = XGBClassifier(random_state=SEED, scale_pos_weight=0.98,
                          **search_space)
    model.fit(df_Xtrain, ser_ytrain)

    # Score hard predictions on the validation split.
    yvalid_pred = model.predict(df_Xvalid)
    return roc_auc_score(ser_yvalid.to_numpy().ravel(), yvalid_pred)
# NOTE: there is inherent non-determinism in optuna hyperparameter selection
# we may not get the same hyperparameters when run twice.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
# Trials are persisted to sqlite so the study can be resumed later.
optuna_storage = 'sqlite:///xgb_optuna_fraud_classifcation.db'
study = optuna.create_study(direction='maximize',
sampler=sampler,
study_name='xgb_optuna',
storage=optuna_storage,
load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS)
# Resume hyper parameter from last state
# (load_if_exists=True picks up the persisted study; timeout caps the
# resumed search at 600 seconds)
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
study = optuna.create_study(direction='maximize',
sampler=sampler,
study_name='xgb_optuna',
storage=optuna_storage,
load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS,timeout=600)
print(f'Number of finished trials: {len(study.trials)}')
# best trial
best_trial = study.best_trial
# best params
params_best = study.best_trial.params
params_best
We need plotly 4 to render visualizations in Jupyter Lab.
from optuna.visualization import (plot_contour,
plot_optimization_history,
plot_parallel_coordinate,
plot_slice)
# Hyperparameters to include in the contour plot.
params=['learning_rate','max_depth',
'n_estimators','subsample','reg_alpha','reg_lambda']
optuna.visualization.plot_contour(study,params=params)
plot_optimization_history(study)
plot_parallel_coordinate(study)
plot_slice(study)
# ---- Evaluate the optuna-tuned model on the test set ----
# time
time_start = time.time()

model_name = 'xgboost'
desc = 'grid search optuna'

# Data aliases (targets converted to flat numpy arrays).
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()

# use best model from the optuna study
# (fix: removed the duplicated assignment `clf_lgb = clf_lgb = ...`)
params_best = study.best_trial.params
clf_lgb = xgb.XGBClassifier(random_state=SEED)
clf_lgb.set_params(**params_best)

# fit and save the model
clf_lgb.fit(Xtr, ytr)
joblib.dump(clf_lgb,'../outputs/clf_xgb_grid_search_optuna.pkl')

# load the saved model (round-trip sanity check of persistence)
clf_lgb = joblib.load('../outputs/clf_xgb_grid_search_optuna.pkl')

# predictions
# NOTE(review): cross_val_predict re-fits clones on the test set; the
# fit above is not what produces these predictions — confirm intended.
skf = StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_lgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# model evaluation
average = 'binary'
row_eval = [model_name,desc,
accuracy_score(ytx, ypreds),
precision_score(ytx, ypreds, average=average),
recall_score(ytx, ypreds, average=average),
f1_score(ytx, ypreds, average=average),
roc_auc_score(ytx, ypreds),
]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
df_eval.sort_values('Recall',ascending=False).style.background_gradient(subset='Recall')

# Confusion-matrix breakdown; cm.ravel() order is [tn, fp, fn, tp].
cm = confusion_matrix(ytest,ypreds)
vals = cm.ravel()
print(cm)
print('xgboost Grid Search Results')
print('-'*25)
print('Total Frauds: ', vals[2] + vals[3])   # fn + tp
print('Incorrect Frauds: ', vals[2])         # fn: missed frauds
print('Incorrect Percent: ', round(vals[2]*100/(vals[2]+vals[3]),2),'%')
from bhishan.bp import plotly_binary_clf_evaluation
yprobs = clf_xgb.predict_proba(df_Xtest)
# NOTE(review): column 0 of predict_proba is the probability of class 0
# (non-fraud); probability-based curves usually want column 1, the
# positive/fraud class — confirm what plotly_binary_clf_evaluation expects.
yprobs = yprobs[:,0] # take only first column
plotly_binary_clf_evaluation('clf_lgb_optuna',clf_xgb,ytx,ypreds,yprobs,df)
# feature importance
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(clf_xgb,ax=ax)
plt.show()
df.head(2)
df.head(2)
# ---- Model interpretation with eli5 ----
import eli5
# Global feature weights of the fitted model.
eli5.show_weights(clf_xgb) # KeyError: 'bias'
from eli5.sklearn import PermutationImportance
feature_names = df_Xtrain.columns.tolist()
# Permutation importance: score drop when each feature is shuffled,
# computed on the held-out test set.
perm = PermutationImportance(clf_xgb).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)
from eli5 import show_prediction
# Per-feature contribution breakdown for one test-set prediction.
show_prediction(clf_xgb, df_Xtest.iloc[0,:],
show_feature_values=True)
# ---- SHAP values via the native xgboost Booster API ----
import shap
shap.initjs()
clf_xgb
# NOTE(review): this dict mixes sklearn-wrapper names (n_estimators,
# n_jobs, num_eval, silent, random_state) with native booster names;
# xgb.train() takes the tree count from n_rounds below and warns about /
# ignores unrecognized keys. Both nthread & n_jobs and seed &
# random_state appear — confirm the duplication is intentional.
params_xgb = dict(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, n_estimators=10,
n_jobs=1, nthread=-1, num_eval=10, objective='binary:logistic',
random_state=100, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
seed=SEED, silent=True, subsample=1)
feature_names = df_Xtrain.columns.tolist()
# DMatrix is xgboost's internal data container for the native API.
dtrain = xgb.DMatrix(df_Xtrain, ser_ytrain,feature_names=feature_names)
dvalid = xgb.DMatrix(df_Xvalid, ser_yvalid,feature_names=feature_names)
dtest = xgb.DMatrix(df_Xtest,feature_names=feature_names)
n_rounds = 100
# Evaluation pairs reported every 10 rounds during training; dvalid also
# drives the 20-round early stopping.
watchlist = [(dtrain, 'dtrain'), (dvalid, 'dvalid')]
booster_model = xgb.train(params_xgb,
dtrain,
n_rounds,
watchlist,
verbose_eval=10,
early_stopping_rounds=20)
ypreds = booster_model.predict(dtest)
%%time
# TreeExplainer computes SHAP values for tree ensembles.
explainer = shap.TreeExplainer(booster_model)
shap_values = explainer.shap_values(dtest)
shap_values
max_display = 30
# Bar chart: mean(|SHAP value|) per feature, top 30.
shap.summary_plot(shap_values, df_Xtest, plot_type="bar",
max_display = max_display)
# Beeswarm plot: per-sample SHAP values colored by feature value.
shap.summary_plot(shap_values, df_Xtest, plot_type='dot', max_display = max_display)
# Replicate the SHAP importance chart by hand:
# mean absolute SHAP value per feature, largest first.
import pprint

df_shap = pd.DataFrame(shap_values, columns=df_Xtrain.columns)
df_fimp = (df_shap
           .abs()
           .mean(axis=0)
           .sort_values(ascending=False))
print(df_fimp.head(max_display))
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# One dependence plot per top feature (keyed by mean |SHAP|).
# NOTE(review): plt.figure(...) is called inside the loop while
# plt.subplot(num_subplots, 1, i+1) indexes a single shared grid — each
# iteration therefore opens a fresh tall figure; the figure was probably
# meant to be created once before the loop — confirm.
dict_shap = (df_fimp.round(3).iloc[:max_display].to_dict())
num_subplots = len(dict_shap)
for i,v in enumerate(dict_shap):
plt.figure(num=None, figsize=(8, 3*num_subplots),
dpi=80, facecolor='w', edgecolor='k');
ax1 = plt.subplot(num_subplots,1,i+1);
title = f"Dependency Plot for {v.title()}"
title += f" Abs mean Shapeley value = {dict_shap[v]:.2f}"
ax1.set_title(title);
shap.dependence_plot(v, shap_values, df_Xtest, ax = ax1)
plt.tight_layout()
plt.show()
N = 10
# Top-N training rows sorted so fraud cases (Class=1) come first.
df_top_ytrain = ser_ytrain.reset_index().sort_values('Class',ascending=False)\
.head(N).reset_index(drop=True)
df_top_ytrain
# shap_values[:N]
# Force plot per case: how each feature pushes the prediction away from
# the expected (base) value.
# NOTE(review): `case` is an index into the TRAINING target, but the
# force plot uses shap_values / df_Xtest row i (the loop position over
# the TEST-set SHAP values) — the printed case id and the plotted row do
# not refer to the same sample; confirm this pairing is intended.
for i,v in df_top_ytrain.iterrows():
case, result = v.to_numpy()
print("CASE {} - Test Fraudulent".format(case))
# plot force plot
display(shap.force_plot(explainer.expected_value,
shap_values[i,:],
df_Xtest.iloc[i,:]))
import yellowbrick
from yellowbrick.target import ClassBalance
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.classifier import ConfusionMatrix
yellowbrick.__version__
df.head(2)
# ---- Yellowbrick evaluation of the tuned model ----
# load the saved model
clf_xgb = joblib.load('../outputs/clf_xgb_grid_search_optuna.pkl')
# predictions
# NOTE(review): as elsewhere, cross_val_predict re-fits clones on the
# test set rather than using the loaded model's fit — confirm intended.
skf = StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_xgb, df_Xtest, ser_ytest, cv=skf)
ypreds = ypreds_cv
ypreds[:5]
# Class-balance bar chart for the full dataset, with value labels.
y = df['Class']
labels = ['Non-Fraud','Fraud']
viz = ClassBalance(labels=labels).fit(y)
bp.add_text_barplot(viz.ax)
viz.ax.set_xticks(range(len(labels)))
viz.ax.set_xticklabels(labels,rotation=90);
from sklearn.metrics import confusion_matrix

# Confusion-matrix breakdown; ravel() order is [tn, fp, fn, tp].
cm = confusion_matrix(ytest, ypreds)
tn, fp, fn, tp = cm.ravel()
print(cm)
print('xgboost Grid Search Results')
print('-' * 25)
print('Total Frauds: ', fn + tp)
print('Incorrect Frauds: ', fn)
print('Incorrect Percent: ', round(fn * 100 / (fn + tp), 2), '%')
# Yellowbrick confusion-matrix visualizer: fit on train, score on test.
viz = ConfusionMatrix(clf_xgb,classes=labels)
viz.fit(df_Xtrain, ser_ytrain)
viz.score(df_Xtest, ser_ytest)
viz.show()
# One-shot helper that does fit + score + show in a single call.
from yellowbrick.classifier import confusion_matrix as ycm
ycm(clf_xgb,
df_Xtrain, ser_ytrain, df_Xtest, ser_ytest,
classes=['Non-Fraud', 'Fraud']
)
plt.tight_layout();
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Text classification report for the CV predictions.
print(classification_report(ytx,ypreds))

# Heat map of per-class precision/recall/f1.
# (fix: .poof() is deprecated in yellowbrick; .show() matches the
# viz.show() call used earlier in this notebook.)
report = ClassificationReport(clf_xgb, size=(1080, 720), labels=labels)
report.score(df_Xtest, ser_ytest)
c = report.show()

# Stacked bar chart of predicted vs actual class membership.
error = ClassPredictionError(clf_xgb, size=(1080, 720), labels=labels)
error.score(df_Xtest, ser_ytest)
e = error.show()

# ROC curves (per class plus micro/macro averages).
rocauc = ROCAUC(clf_xgb, size=(1080, 720), labels=labels)
rocauc.score(df_Xtest, ser_ytest)
r = rocauc.show()
# Total notebook wall-clock time.
# (fixes: use the captured end timestamp instead of a second time.time()
# call, and correct the "noteook" typo in the printed message.)
notebook_end_time = time.time()
time_taken = notebook_end_time - notebook_start_time
h,m = divmod(time_taken,60*60)   # h = whole hours, m = leftover seconds
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))